{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "793a50dc1748efb7",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# 用每日新闻预测金融市场变化（标准版）\n",
    "\n",
    "TF-IDF + SVM 是文本分类问题的基准线"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4ea3d65c56cdfe24",
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2025-07-22T08:17:51.350951Z",
     "start_time": "2025-07-22T08:17:50.389907Z"
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "from datetime import date\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.svm import LinearSVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1621605cd3796437",
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2025-07-22T08:17:56.335839Z",
     "start_time": "2025-07-22T08:17:56.259537Z"
    }
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv('input/Combined_News_DJIA.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9424d604-02ef-4346-a094-179627942fa3",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:17:59.312056Z",
     "start_time": "2025-07-22T08:17:59.293576Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "         Date  Label                                               Top1  \\\n0  2008-08-08      0  b\"Georgia 'downs two Russian warplanes' as cou...   \n1  2008-08-11      1  b'Why wont America and Nato help us? If they w...   \n2  2008-08-12      0  b'Remember that adorable 9-year-old who sang a...   \n3  2008-08-13      0  b' U.S. refuses Israel weapons to attack Iran:...   \n4  2008-08-14      1  b'All the experts admit that we should legalis...   \n\n                                                Top2  \\\n0            b'BREAKING: Musharraf to be impeached.'   \n1        b'Bush puts foot down on Georgian conflict'   \n2                 b\"Russia 'ends Georgia operation'\"   \n3  b\"When the president ordered to attack Tskhinv...   \n4  b'War in South Osetia - 89 pictures made by a ...   \n\n                                                Top3  \\\n0  b'Russia Today: Columns of troops roll into So...   \n1  b\"Jewish Georgian minister: Thanks to Israeli ...   \n2  b'\"If we had no sexual harassment we would hav...   \n3  b' Israel clears troops who killed Reuters cam...   \n4  b'Swedish wrestler Ara Abrahamian throws away ...   \n\n                                                Top4  \\\n0  b'Russian tanks are moving towards the capital...   \n1  b'Georgian army flees in disarray as Russians ...   \n2  b\"Al-Qa'eda is losing support in Iraq because ...   \n3  b'Britain\\'s policy of being tough on drugs is...   \n4  b'Russia exaggerated the death toll in South O...   \n\n                                                Top5  \\\n0  b\"Afghan children raped with 'impunity,' U.N. ...   \n1      b\"Olympic opening ceremony fireworks 'faked'\"   \n2  b'Ceasefire in Georgia: Putin Outmaneuvers the...   \n3  b'Body of 14 year old found in trunk; Latest (...   \n4  b'Missile That Killed 9 Inside Pakistan May Ha...   \n\n                                                Top6  \\\n0  b'150 Russian tanks have entered South Ossetia...   
\n1  b'What were the Mossad with fraudulent New Zea...   \n2  b'Why Microsoft and Intel tried to kill the XO...   \n3  b'China has moved 10 *million* quake survivors...   \n4  b\"Rushdie Condemns Random House's Refusal to P...   \n\n                                                Top7  \\\n0  b\"Breaking: Georgia invades South Ossetia, Rus...   \n1  b'Russia angered by Israeli military sale to G...   \n2  b'Stratfor: The Russo-Georgian War and the Bal...   \n3  b\"Bush announces Operation Get All Up In Russi...   \n4  b'Poland and US agree to missle defense deal. ...   \n\n                                                Top8  ...  \\\n0  b\"The 'enemy combatent' trials are nothing but...  ...   \n1  b'An American citizen living in S.Ossetia blam...  ...   \n2  b\"I'm Trying to Get a Sense of This Whole Geor...  ...   \n3             b'Russian forces sink Georgian ships '  ...   \n4  b'Will the Russians conquer Tblisi? Bet on it,...  ...   \n\n                                               Top16  \\\n0  b'Georgia Invades South Ossetia - if Russia ge...   \n1  b'Israel and the US behind the Georgian aggres...   \n2  b'U.S. troops still in Georgia (did you know t...   \n3                      b'Elephants extinct by 2020?'   \n4  b'Bank analyst forecast Georgian crisis 2 days...   \n\n                                               Top17  \\\n0                b'Al-Qaeda Faces Islamist Backlash'   \n1  b'\"Do not believe TV, neither Russian nor Geor...   \n2       b'Why Russias response to Georgia was right'   \n3  b'US humanitarian missions soon in Georgia - i...   \n4  b\"Georgia confict could set back Russia's US r...   \n\n                                               Top18  \\\n0  b'Condoleezza Rice: \"The US would not act to p...   \n1  b'Riots are still going on in Montreal (Canada...   \n2  b'Gorbachev accuses U.S. of making a \"serious ...   \n3             b\"Georgia's DDOS came from US sources\"   \n4  b'War in the Caucasus is as much the product o...   
\n\n                                               Top19  \\\n0  b'This is a busy day:  The European Union has ...   \n1    b'China to overtake US as largest manufacturer'   \n2         b'Russia, Georgia, and NATO: Cold War Two'   \n3  b'Russian convoy heads into Georgia, violating...   \n4  b'\"Non-media\" photos of South Ossetia/Georgia ...   \n\n                                               Top20  \\\n0  b\"Georgia will withdraw 1,000 soldiers from Ir...   \n1                     b'War in South Ossetia [PICS]'   \n2  b'Remember that adorable 62-year-old who led y...   \n3  b'Israeli defence minister: US against strike ...   \n4  b'Georgian TV reporter shot by Russian sniper ...   \n\n                                               Top21  \\\n0  b'Why the Pentagon Thinks Attacking Iran is a ...   \n1  b'Israeli Physicians Group Condemns State Tort...   \n2          b'War in Georgia: The Israeli connection'   \n3                     b'Gorbachev: We Had No Choice'   \n4  b'Saudi Arabia: Mother moves to block child ma...   \n\n                                               Top22  \\\n0  b'Caucasus in crisis: Georgia invades South Os...   \n1  b' Russia has just beaten the United States ov...   \n2  b'All signs point to the US encouraging Georgi...   \n3  b'Witness: Russian forces head towards Tbilisi...   \n4   b'Taliban wages war on humanitarian aid workers'   \n\n                                               Top23  \\\n0  b'Indian shoe manufactory  - And again in a se...   \n1  b'Perhaps *the* question about the Georgia - R...   \n2  b'Christopher King argues that the US and NATO...   \n3  b' Quarter of Russians blame U.S. for conflict...   \n4  b'Russia: World  \"can forget about\" Georgia\\'s...   \n\n                                               Top24  \\\n0  b'Visitors Suffering from Mental Illnesses Ban...   \n1                 b'Russia is so much better at war'   \n2                        b'America: The New Mexico?'   
\n3  b'Georgian president  says US military will ta...   \n4  b'Darfur rebels accuse Sudan of mounting major...   \n\n                                               Top25  \n0           b\"No Help for Mexico's Kidnapping Surge\"  \n1  b\"So this is what it's come to: trading sex fo...  \n2  b\"BBC NEWS | Asia-Pacific | Extinction 'by man...  \n3  b'2006: Nobel laureate Aleksander Solzhenitsyn...  \n4  b'Philippines : Peace Advocate say Muslims nee...  \n\n[5 rows x 27 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Date</th>\n      <th>Label</th>\n      <th>Top1</th>\n      <th>Top2</th>\n      <th>Top3</th>\n      <th>Top4</th>\n      <th>Top5</th>\n      <th>Top6</th>\n      <th>Top7</th>\n      <th>Top8</th>\n      <th>...</th>\n      <th>Top16</th>\n      <th>Top17</th>\n      <th>Top18</th>\n      <th>Top19</th>\n      <th>Top20</th>\n      <th>Top21</th>\n      <th>Top22</th>\n      <th>Top23</th>\n      <th>Top24</th>\n      <th>Top25</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2008-08-08</td>\n      <td>0</td>\n      <td>b\"Georgia 'downs two Russian warplanes' as cou...</td>\n      <td>b'BREAKING: Musharraf to be impeached.'</td>\n      <td>b'Russia Today: Columns of troops roll into So...</td>\n      <td>b'Russian tanks are moving towards the capital...</td>\n      <td>b\"Afghan children raped with 'impunity,' U.N. 
...</td>\n      <td>b'150 Russian tanks have entered South Ossetia...</td>\n      <td>b\"Breaking: Georgia invades South Ossetia, Rus...</td>\n      <td>b\"The 'enemy combatent' trials are nothing but...</td>\n      <td>...</td>\n      <td>b'Georgia Invades South Ossetia - if Russia ge...</td>\n      <td>b'Al-Qaeda Faces Islamist Backlash'</td>\n      <td>b'Condoleezza Rice: \"The US would not act to p...</td>\n      <td>b'This is a busy day:  The European Union has ...</td>\n      <td>b\"Georgia will withdraw 1,000 soldiers from Ir...</td>\n      <td>b'Why the Pentagon Thinks Attacking Iran is a ...</td>\n      <td>b'Caucasus in crisis: Georgia invades South Os...</td>\n      <td>b'Indian shoe manufactory  - And again in a se...</td>\n      <td>b'Visitors Suffering from Mental Illnesses Ban...</td>\n      <td>b\"No Help for Mexico's Kidnapping Surge\"</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2008-08-11</td>\n      <td>1</td>\n      <td>b'Why wont America and Nato help us? 
If they w...</td>\n      <td>b'Bush puts foot down on Georgian conflict'</td>\n      <td>b\"Jewish Georgian minister: Thanks to Israeli ...</td>\n      <td>b'Georgian army flees in disarray as Russians ...</td>\n      <td>b\"Olympic opening ceremony fireworks 'faked'\"</td>\n      <td>b'What were the Mossad with fraudulent New Zea...</td>\n      <td>b'Russia angered by Israeli military sale to G...</td>\n      <td>b'An American citizen living in S.Ossetia blam...</td>\n      <td>...</td>\n      <td>b'Israel and the US behind the Georgian aggres...</td>\n      <td>b'\"Do not believe TV, neither Russian nor Geor...</td>\n      <td>b'Riots are still going on in Montreal (Canada...</td>\n      <td>b'China to overtake US as largest manufacturer'</td>\n      <td>b'War in South Ossetia [PICS]'</td>\n      <td>b'Israeli Physicians Group Condemns State Tort...</td>\n      <td>b' Russia has just beaten the United States ov...</td>\n      <td>b'Perhaps *the* question about the Georgia - R...</td>\n      <td>b'Russia is so much better at war'</td>\n      <td>b\"So this is what it's come to: trading sex fo...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2008-08-12</td>\n      <td>0</td>\n      <td>b'Remember that adorable 9-year-old who sang a...</td>\n      <td>b\"Russia 'ends Georgia operation'\"</td>\n      <td>b'\"If we had no sexual harassment we would hav...</td>\n      <td>b\"Al-Qa'eda is losing support in Iraq because ...</td>\n      <td>b'Ceasefire in Georgia: Putin Outmaneuvers the...</td>\n      <td>b'Why Microsoft and Intel tried to kill the XO...</td>\n      <td>b'Stratfor: The Russo-Georgian War and the Bal...</td>\n      <td>b\"I'm Trying to Get a Sense of This Whole Geor...</td>\n      <td>...</td>\n      <td>b'U.S. troops still in Georgia (did you know t...</td>\n      <td>b'Why Russias response to Georgia was right'</td>\n      <td>b'Gorbachev accuses U.S. 
of making a \"serious ...</td>\n      <td>b'Russia, Georgia, and NATO: Cold War Two'</td>\n      <td>b'Remember that adorable 62-year-old who led y...</td>\n      <td>b'War in Georgia: The Israeli connection'</td>\n      <td>b'All signs point to the US encouraging Georgi...</td>\n      <td>b'Christopher King argues that the US and NATO...</td>\n      <td>b'America: The New Mexico?'</td>\n      <td>b\"BBC NEWS | Asia-Pacific | Extinction 'by man...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2008-08-13</td>\n      <td>0</td>\n      <td>b' U.S. refuses Israel weapons to attack Iran:...</td>\n      <td>b\"When the president ordered to attack Tskhinv...</td>\n      <td>b' Israel clears troops who killed Reuters cam...</td>\n      <td>b'Britain\\'s policy of being tough on drugs is...</td>\n      <td>b'Body of 14 year old found in trunk; Latest (...</td>\n      <td>b'China has moved 10 *million* quake survivors...</td>\n      <td>b\"Bush announces Operation Get All Up In Russi...</td>\n      <td>b'Russian forces sink Georgian ships '</td>\n      <td>...</td>\n      <td>b'Elephants extinct by 2020?'</td>\n      <td>b'US humanitarian missions soon in Georgia - i...</td>\n      <td>b\"Georgia's DDOS came from US sources\"</td>\n      <td>b'Russian convoy heads into Georgia, violating...</td>\n      <td>b'Israeli defence minister: US against strike ...</td>\n      <td>b'Gorbachev: We Had No Choice'</td>\n      <td>b'Witness: Russian forces head towards Tbilisi...</td>\n      <td>b' Quarter of Russians blame U.S. 
for conflict...</td>\n      <td>b'Georgian president  says US military will ta...</td>\n      <td>b'2006: Nobel laureate Aleksander Solzhenitsyn...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2008-08-14</td>\n      <td>1</td>\n      <td>b'All the experts admit that we should legalis...</td>\n      <td>b'War in South Osetia - 89 pictures made by a ...</td>\n      <td>b'Swedish wrestler Ara Abrahamian throws away ...</td>\n      <td>b'Russia exaggerated the death toll in South O...</td>\n      <td>b'Missile That Killed 9 Inside Pakistan May Ha...</td>\n      <td>b\"Rushdie Condemns Random House's Refusal to P...</td>\n      <td>b'Poland and US agree to missle defense deal. ...</td>\n      <td>b'Will the Russians conquer Tblisi? Bet on it,...</td>\n      <td>...</td>\n      <td>b'Bank analyst forecast Georgian crisis 2 days...</td>\n      <td>b\"Georgia confict could set back Russia's US r...</td>\n      <td>b'War in the Caucasus is as much the product o...</td>\n      <td>b'\"Non-media\" photos of South Ossetia/Georgia ...</td>\n      <td>b'Georgian TV reporter shot by Russian sniper ...</td>\n      <td>b'Saudi Arabia: Mother moves to block child ma...</td>\n      <td>b'Taliban wages war on humanitarian aid workers'</td>\n      <td>b'Russia: World  \"can forget about\" Georgia\\'s...</td>\n      <td>b'Darfur rebels accuse Sudan of mounting major...</td>\n      <td>b'Philippines : Peace Advocate say Muslims nee...</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 27 columns</p>\n</div>"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b714bb5a-1432-4242-b8cb-fdd7acf51124",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:18:01.639344Z",
     "start_time": "2025-07-22T08:18:01.629488Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1989 entries, 0 to 1988\n",
      "Data columns (total 27 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   Date    1989 non-null   object\n",
      " 1   Label   1989 non-null   int64 \n",
      " 2   Top1    1989 non-null   object\n",
      " 3   Top2    1989 non-null   object\n",
      " 4   Top3    1989 non-null   object\n",
      " 5   Top4    1989 non-null   object\n",
      " 6   Top5    1989 non-null   object\n",
      " 7   Top6    1989 non-null   object\n",
      " 8   Top7    1989 non-null   object\n",
      " 9   Top8    1989 non-null   object\n",
      " 10  Top9    1989 non-null   object\n",
      " 11  Top10   1989 non-null   object\n",
      " 12  Top11   1989 non-null   object\n",
      " 13  Top12   1989 non-null   object\n",
      " 14  Top13   1989 non-null   object\n",
      " 15  Top14   1989 non-null   object\n",
      " 16  Top15   1989 non-null   object\n",
      " 17  Top16   1989 non-null   object\n",
      " 18  Top17   1989 non-null   object\n",
      " 19  Top18   1989 non-null   object\n",
      " 20  Top19   1989 non-null   object\n",
      " 21  Top20   1989 non-null   object\n",
      " 22  Top21   1989 non-null   object\n",
      " 23  Top22   1989 non-null   object\n",
      " 24  Top23   1988 non-null   object\n",
      " 25  Top24   1986 non-null   object\n",
      " 26  Top25   1986 non-null   object\n",
      "dtypes: int64(1), object(26)\n",
      "memory usage: 419.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data.info()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "13e5cfbc-108c-4b31-9f8e-07b1b20a2c3a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:18:03.647533Z",
     "start_time": "2025-07-22T08:18:03.638483Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "             Label\ncount  1989.000000\nmean      0.535445\nstd       0.498867\nmin       0.000000\n25%       0.000000\n50%       1.000000\n75%       1.000000\nmax       1.000000",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Label</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>count</th>\n      <td>1989.000000</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>0.535445</td>\n    </tr>\n    <tr>\n      <th>std</th>\n      <td>0.498867</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>25%</th>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>50%</th>\n      <td>1.000000</td>\n    </tr>\n    <tr>\n      <th>75%</th>\n      <td>1.000000</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>1.000000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "66659c32-3a7d-4262-8034-ce940e8f7edc",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:18:06.117093Z",
     "start_time": "2025-07-22T08:18:06.112553Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "(1989, 27)"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b2ccbe5b-1cdc-4e58-bf7b-c8c5bc5ec048",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:18:07.745583Z",
     "start_time": "2025-07-22T08:18:07.526489Z"
    }
   },
   "outputs": [],
   "source": [
    "data['combined_news'] = data.filter(regex=('Top.*')).apply(lambda x: ''.join(str(x.values)),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cc61fe09-f02d-42b7-a9d5-1dc979e0c84f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:18:09.615156Z",
     "start_time": "2025-07-22T08:18:09.607177Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "0       ['b\"Georgia \\'downs two Russian warplanes\\' as...\n1       [\"b'Why wont America and Nato help us? If they...\n2       [\"b'Remember that adorable 9-year-old who sang...\n3       [\"b' U.S. refuses Israel weapons to attack Ira...\n4       [\"b'All the experts admit that we should legal...\n                              ...                        \n1984    ['Barclays and RBS shares suspended from tradi...\n1985    ['2,500 Scientists To Australia: If You Want T...\n1986    ['Explosion At Airport In Istanbul'\\n 'Yemeni ...\n1987    ['Jamaica proposes marijuana dispensers for to...\n1988    ['A 117-year-old woman in Mexico City finally ...\nName: combined_news, Length: 1989, dtype: object"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['combined_news']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f8c4b355-3efa-4d17-95a3-da819d2a6758",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:18:44.378774Z",
     "start_time": "2025-07-22T08:18:43.666558Z"
    }
   },
   "outputs": [],
   "source": [
    "feature_extraction = TfidfVectorizer(stop_words='english')\n",
    "X = feature_extraction.fit_transform(data['combined_news'])  # Fit and transform on full data\n",
    "y = data['Label']\n",
    "\n",
    "# Train-test split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1e765bf4-45f1-469a-9f56-f088bb98febc",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:19:04.676393Z",
     "start_time": "2025-07-22T08:19:04.673263Z"
    }
   },
   "outputs": [],
   "source": [
    "clf = SVC(probability=True,kernel='rbf')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ac3e1143-a3ad-4525-94d9-461d12e1d2f9",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:19:33.658918Z",
     "start_time": "2025-07-22T08:19:07.343626Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "SVC(probability=True)",
      "text/html": "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>SVC(probability=True)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SVC</label><div class=\"sk-toggleable__content\"><pre>SVC(probability=True)</pre></div></div></div></div></div>"
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "02dc9d4a-c88a-4163-9f2e-eeddfca3cd4f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:22:26.786189Z",
     "start_time": "2025-07-22T08:22:25.420205Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC-AUC yiedls0.5185614550325889\n"
     ]
    }
   ],
   "source": [
    "predictions = clf.predict_proba(X_test)\n",
    "print('ROC-AUC yiedls: '+ str(roc_auc_score(y_test,predictions[:,1])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f71e1a8c-1a7a-4a73-b1ce-839e180687d1",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:23:30.547101Z",
     "start_time": "2025-07-22T08:23:29.834884Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC-AUC yields: 0.47265373418862877\n"
     ]
    }
   ],
   "source": [
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(data['combined_news'], y, test_size=0.2, random_state=42)\n",
    "\n",
    "tfidf = TfidfVectorizer(stop_words='english')\n",
    "X_train_tfidf = tfidf.fit_transform(X_train)\n",
    "X_test_tfidf = tfidf.transform(X_test)\n",
    "\n",
    "# Train and predict\n",
    "clf = LinearSVC()\n",
    "clf.fit(X_train_tfidf, y_train)\n",
    "y_scores = clf.decision_function(X_test_tfidf)  # 使用 transform 后的数据\n",
    "print('ROC-AUC yields: ' + str(roc_auc_score(y_test, y_scores)))"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# 如何去掉区分度不高的词（stopword、number)\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "import re   \n",
    "\n",
    "stop = stopwords.words('english')\n",
    "wordnet = WordNetLemmatizer()\n",
    "def hasNumber(inputString):\n",
    "    return bool(re.findall(r'\\d', inputString))\n",
    "\n",
    "def check(word):\n",
    "    if word in stop: return False \n",
    "    elif hasNumber(word):\n",
    "        return False\n",
    "    else:\n",
    "        return True\n",
    "\n",
    "X_train = X_train.apply(lambda x: [wordnet.lemmatize(item) for item in x if check(item)])\n",
    "X_test = X_test.apply(lambda x: [wordnet.lemmatize(item) for item in x if check(item)])\n",
    "#　sklearn 只支持string输入，所以我们把调整后的list再变回string\n",
    "X_train = X_train.apply(lambda x: ' '.join(x))\n",
    "X_test = X_test.apply(lambda x: ' '.join(x))"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "af24b91f4f9366ba"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
